Disclaimer

The author of this page is not an epidemiologist. All data visualizations presented here are for code demonstration purposes only and are not meant to be a substitute for public health information. For current information we recommend the JHU Coronavirus dashboard and Nextstrain.

The Raw Data

Download GitHub Repo

The raw data compiled by Johns Hopkins University Center for Systems Science and Engineering and used in online dashboards tracking the virus spread are publicly available in a GitHub repository.

We will clone the entire repository and access the data files within.

# Remove any previous checkout so the clone starts from a clean directory.
# unlink() is used instead of the shell command `rm -R`, which is not valid R;
# it does nothing if the directory does not exist.
unlink('COVID-19', recursive = TRUE)
# Clone the JHU CSSE data repository into ./COVID-19.
system('git clone https://github.com/CSSEGISandData/COVID-19')
# Show the top-level contents of the checkout.
list.files('COVID-19')
## [1] "archived_data"                  "csse_covid_19_data"            
## [3] "README.md"                      "who_covid_19_situation_reports"

Explore

Inside this folder we have the current data from various government and international organizations tracking the spread of the Coronavirus. We will mostly use the ‘csse_covid_19_data’ folder, which compiles the confirmed case and death counts.

# List the time-series files shipped in the cloned repository.
list.files(path = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series')
## [1] "README.md"                          "time_series_19-covid-Confirmed.csv"
## [3] "time_series_19-covid-Deaths.csv"    "time_series_19-covid-Recovered.csv"

Visualize

Read

Read case trends file:

# Load the confirmed-case time series, then show its column names.
cases <- read.csv(
  file = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'
)
names(cases)
##  [1] "Province.State" "Country.Region" "Lat"            "Long"          
##  [5] "X1.22.20"       "X1.23.20"       "X1.24.20"       "X1.25.20"      
##  [9] "X1.26.20"       "X1.27.20"       "X1.28.20"       "X1.29.20"      
## [13] "X1.30.20"       "X1.31.20"       "X2.1.20"        "X2.2.20"       
## [17] "X2.3.20"        "X2.4.20"        "X2.5.20"        "X2.6.20"       
## [21] "X2.7.20"        "X2.8.20"        "X2.9.20"        "X2.10.20"      
## [25] "X2.11.20"       "X2.12.20"       "X2.13.20"       "X2.14.20"      
## [29] "X2.15.20"       "X2.16.20"       "X2.17.20"       "X2.18.20"      
## [33] "X2.19.20"       "X2.20.20"       "X2.21.20"       "X2.22.20"      
## [37] "X2.23.20"       "X2.24.20"       "X2.25.20"       "X2.26.20"      
## [41] "X2.27.20"       "X2.28.20"       "X2.29.20"       "X3.1.20"       
## [45] "X3.2.20"        "X3.3.20"        "X3.4.20"        "X3.5.20"       
## [49] "X3.6.20"        "X3.7.20"        "X3.8.20"        "X3.9.20"       
## [53] "X3.10.20"       "X3.11.20"       "X3.12.20"       "X3.13.20"

Read death trends file:

# Load the deaths time series, then show its column names.
deaths <- read.csv(
  file = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv'
)
names(deaths)
##  [1] "Province.State" "Country.Region" "Lat"            "Long"          
##  [5] "X1.22.20"       "X1.23.20"       "X1.24.20"       "X1.25.20"      
##  [9] "X1.26.20"       "X1.27.20"       "X1.28.20"       "X1.29.20"      
## [13] "X1.30.20"       "X1.31.20"       "X2.1.20"        "X2.2.20"       
## [17] "X2.3.20"        "X2.4.20"        "X2.5.20"        "X2.6.20"       
## [21] "X2.7.20"        "X2.8.20"        "X2.9.20"        "X2.10.20"      
## [25] "X2.11.20"       "X2.12.20"       "X2.13.20"       "X2.14.20"      
## [29] "X2.15.20"       "X2.16.20"       "X2.17.20"       "X2.18.20"      
## [33] "X2.19.20"       "X2.20.20"       "X2.21.20"       "X2.22.20"      
## [37] "X2.23.20"       "X2.24.20"       "X2.25.20"       "X2.26.20"      
## [41] "X2.27.20"       "X2.28.20"       "X2.29.20"       "X3.1.20"       
## [45] "X3.2.20"        "X3.3.20"        "X3.4.20"        "X3.5.20"       
## [49] "X3.6.20"        "X3.7.20"        "X3.8.20"        "X3.9.20"       
## [53] "X3.10.20"       "X3.11.20"       "X3.12.20"       "X3.13.20"

These tables are organized with one column for each date on which data were recorded. The R packages dplyr and ggplot2 do not work well with this wide format. Instead, we need to rearrange the data so that each row contains only one value for cases/deaths, with the geography and date that the value applies to stored in other columns.

library(reshape2)
# Reshape the confirmed-case table from wide (one column per date) to long
# (one row per location and date). melt() stores the former column name in
# `variable` and the count in `value`.
cases_df <- melt(cases,
                 id.vars = c('Province.State', 'Country.Region', 'Lat', 'Long'),
                 # Date columns are the ones read.csv prefixed with "X";
                 # anchor the pattern so only a leading X matches.
                 measure.vars = grep('^X', colnames(cases), value = TRUE)
                 )

# Same reshape for deaths. The date columns are taken from `deaths` itself,
# not from `cases`, so the two files may cover different dates.
deaths_df <- melt(deaths,
                  id.vars = c('Province.State', 'Country.Region', 'Lat', 'Long'),
                  measure.vars = grep('^X', colnames(deaths), value = TRUE)
                  )

Filter with dplyr

How would you use dplyr to filter for cases (or deaths) by geography (country and state)?

#Total cases per country and date, then get data for US
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
# Sum the case counts over all rows of a country for each date column.
cases_notnull <- cases_df %>%
  group_by(Country.Region, variable) %>%
  summarise(total_cases = sum(value))

# Keep only the rows for the US.
us_cases <- filter(cases_notnull, Country.Region == 'US')

# Same per-date total, restricted to rows whose Province.State is
# Massachusetts.
ma_cases <- cases_df %>%
  filter(Province.State == 'Massachusetts') %>%
  group_by(Country.Region, variable) %>%
  summarise(total_cases = sum(value))

Visualize

Plot case trends for every country

library(ggplot2)

# One path per country, coloured by country, with the legend hidden.
ggplot(
  cases_notnull,
  aes(x = variable, y = total_cases,
      group = Country.Region, colour = Country.Region)
) +
  geom_path() +
  theme_minimal() +
  theme(legend.position = 'none')

And again, but this time on a log scale so we can see the smaller lines.

# Same per-country paths as before, drawn on a log10 y axis.
ggplot(cases_notnull) +
  geom_path(
    mapping = aes(variable, total_cases,
                  group = Country.Region,
                  colour = Country.Region)
  ) +
  scale_y_log10() +
  theme_minimal() +
  theme(legend.position = 'none')
## Warning: Transformation introduced infinite values in continuous y-axis

What about the cases in Massachusetts only?

# Path of the Massachusetts totals over the date columns, legend hidden.
ggplot(
  data = ma_cases,
  mapping = aes(x = variable, y = total_cases,
                group = Country.Region, colour = Country.Region)
) +
  geom_path() +
  theme_minimal() +
  theme(legend.position = 'none')

Interactive Charts

Try plotly:

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Create the static ggplot object; the data are ungrouped before plotting.
g1 <- ggplot(data = cases_notnull %>% ungroup()) +
  geom_path(aes(x = variable,
                y = total_cases,
                group = Country.Region,
                col = Country.Region)
            ) +
  theme_minimal() +
  theme(legend.position = 'none')

# Convert to an interactive plotly chart. The plot is passed explicitly
# rather than relying on ggplotly()'s default of the most recently created
# plot, so the result does not depend on what else was plotted beforehand.
ggplotly(g1)

plotly plot for MA cases only:

library(plotly)
# Create the static ggplot object for the Massachusetts totals; the data are
# ungrouped before plotting.
g1 <- ggplot(data = ma_cases %>% ungroup()) +
  geom_path(aes(x = variable,
                y = total_cases,
                group = Country.Region,
                col = Country.Region)
            ) +
  theme_minimal() +
  theme(legend.position = 'none')

# Convert to an interactive plotly chart. The plot is passed explicitly
# rather than relying on ggplotly()'s default of the most recently created
# plot.
ggplotly(g1)

Cumulative Infections Script

The graphs above show trends in active infection cases over time. Below is a self-contained, experimental R script to generate cumulative infection tables and graphs.

#!/usr/bin/env Rscript
# Standalone script: fetch the JHU CSSE COVID-19 repository, reshape the
# confirmed-case time series to long format, total it per country and date,
# and plot one line per country.
library(dplyr)
library(tibble)
library(ggplot2)
library(reshape2)

# Get data. `git clone` fails when the target directory already exists, so
# only clone when there is no checkout yet, and stop with a clear message if
# the clone itself fails instead of continuing without data.
if (!dir.exists('COVID-19')) {
  clone_status <- system('git clone https://github.com/CSSEGISandData/COVID-19')
  if (clone_status != 0) {
    stop('git clone failed with exit status ', clone_status, call. = FALSE)
  }
}

# Load the confirmed-case time series (only this file is used below).
cases <- read.csv('COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv')

# Wide -> long: the date columns (names starting with "X") become rows, with
# the column name in `variable` and the count in `value`.
cases_df <- melt(cases,
                 id.vars = c('Province.State', 'Country.Region', 'Lat', 'Long'),
                 measure.vars = grep('^X', colnames(cases), value = TRUE)
                 )

# Total cases per country for each date column.
cases_notnull <- cases_df %>%
  group_by(Country.Region, variable) %>%
  summarize(total_cases = sum(value))


# Build the per-country line plot; the outer parentheses also print it.
(g1 <- ggplot(data = cases_notnull) +
  geom_path(aes(x = variable,
                y = total_cases,
                group = Country.Region,
                col = Country.Region)
            ) +
    theme(legend.position = 'none')
)

# Same plot restricted to 0-10000 on the y axis; ggplot2 warns about the rows
# it removes because they fall outside that range.
g1 + ylim(0, 10000)
## Warning: Removed 48 rows containing missing values (geom_path).